# Directory structure
github_dir <- file.path("G:\\Shared drives\\Nord Lab - Computational Projects\\MIA\\eLIFE_Clean_code_for_GitHub")

setwd(github_dir)


# Global R markdown code chunk options
knitr::opts_chunk$set(message=FALSE, 
                      warning = FALSE, 
                      error=FALSE, 
                      echo=TRUE, 
                      cache=TRUE, 
                      fig.width = 7, fig.height = 6, 
                      fig.align = 'left')

# R packages
library(sva)
library(RUVnormalize)
library(pheatmap)
library(edgeR)
library(GenomicFeatures)
library(mclust)
library(parallel)
library(ggplot2)
library(reshape2)
library(dplyr)
library(data.table)
library(DT)
library(RColorBrewer)
library(knitr)
library(ggrepel)
library(gridExtra)
library(grid)
library(plyr)
library(plotly)
library(Hmisc)
# Reads input files

setwd("G:/Shared drives/Nord Lab - Computational Projects/MIA/eLIFE_Clean_code_for_GitHub/")

exp.data <- read.csv("./GEO_submission files/Gene_counts.csv")
rownames(exp.data) <- exp.data$X
exp.data$X <- NULL

sample.info <- read.csv("./GEO_submission files/metadata.csv")

load("./GEO_submission files/exonic.gene.sizes")

Sample metadata

sample.info_submission <- read.csv("GEO_submission files/Supplementary Table 5, RNA-seq sample metadata_03_02_2021.csv")

datatable(sample.info_submission, 
          rownames = FALSE, 
          filter = list(position = 'top', clear = FALSE, plain = FALSE), 
          options = list(pageLength = 10, scrollX=T, scrollY=T))

Count data QC

Identification of sample sex by Xist gene read counts

A threshold of 1000 counts of Xist gene was used to identify sex

# Qualifies F or M based on the Xist expression
sex.by.rna <- c(ifelse(exp.data["Xist",]>1000,"F","M")) #

Xist_exp <- as.data.frame(reshape2::melt(exp.data["Xist",]))
Xist_exp <- cbind(Xist_exp, sex.by.rna)
Xist_exp <- arrange(Xist_exp, value)
colnames(Xist_exp) <- c("variable", "counts", "sex.by.rna")

ggplot(Xist_exp, aes(x=sex.by.rna, y=counts, colours=sex.by.rna))+
  geom_jitter()+
  geom_boxplot(alpha=0.2)+
  theme_bw()+
  labs(title="Xist read counts", x = "", y = "Read counts")+
  theme(plot.title = element_text(size = rel(2), hjust=0.5))+
  scale_x_discrete(breaks= c("F", "M"), labels=c("Females", "Males"))

RPKM calculation and filtering of genes with very low expression

# Calculates RPKM values
# Gene lengths calculated with lapply
gene.lengths <- as.numeric(lapply(1:nrow(exp.data), function(x) FUN= as.numeric(exonic.gene.sizes[rownames(exp.data)[x]])))

#
rpkm.data <- rpkm(exp.data, gene.length=gene.lengths, log=T, prior.count=.25)
rpkm.data_linear <- rpkm(exp.data, gene.length=gene.lengths, log=F)
rpkm.data_linear_all <- rpkm.data_linear

#write.csv(rpkm.data, file = "GEO_submission #files/rpkm_log2_24015.csv")
#write.csv(rpkm.data_linear, file = "GEO_submission #files/rpkm_linear_24015.csv")


### Removes genes with low expression ###

# Lets plot a histogram of all rpkm values in a dataset.
#dim(rpkm.data) #24015 rows and 74 columns

df <- reshape2::melt(rpkm.data)
df$rpkm_linear <- 2^df$value # This converts the log2 RPKM values back to linear scale
colnames(df) <- c("gene_name", "sample", "rpkm_log2","rpkm_linear")

# Histogram of all rpkm values in a dataset.
ggplot(df, aes(x=rpkm_log2))+ 
  geom_histogram(bins = 1000, color="black")+
  theme_bw()+
  labs(title="Unfiltered dataset of 24015 genes", 
       x="log2 RPKM", y = "Read counts")+
  theme(plot.title = element_text(hjust = 0.5))

# Setting the threshold for minimum rpkm value in at least 2 samples. 
threshold = -2

# Nymber of genes after filtering
#sum(reshape2::melt(rowSums(rpkm.data > threshold) >= 2)$value) # 17051

# Let's filter the dataset and plot the histogram distribution
keep <- as.data.frame(reshape2::melt(rowSums(rpkm.data > threshold) >= 2))
keep$gene_name <- rownames(keep)
keep <- filter(keep, value == "TRUE")$gene_name

# NOTE: The set of included genes reported in the manuscript was determined using expression criteria from an expanded set of samples including a condition that was not used for the manuscript. The difference between the gene counts is 17195 in the manuscript from the larger set of samples, compared to 17051 genes that would pass in the set of samples used in the MIA vs. control comparison.  Note that there is no difference among DEG passing FDR < 0.05 using either gene set.  

original_exp.data <- read.csv("GEO_submission files/Count_gene_set_from_initial_submission.csv")

# length(keep) # 17051
# length(original_exp.data$gene_name) # 17195

keep <- original_exp.data$gene_name

# 
datExpr_test <- as.data.frame(rpkm.data)
datExpr_test$gene_name <- rownames(datExpr_test)

# Filters the "keep" genes 
datExpr_test <- filter(datExpr_test, gene_name %in% keep)

# Generates a matrix-type data frame
rownames(datExpr_test) <- datExpr_test$gene_name
datExpr_test$gene_name <- NULL

# Overwrites the original datExpr object.
datExpr <- datExpr_test


# Plots the histogram after filtering
df <- reshape2::melt(datExpr)
df$rpkm_linear <- 2^df$value # This converts the log2 RPKM values back to linear scale
colnames(df) <- c("sample", "rpkm_log2","rpkm_linear")

ggplot(df, aes(x=rpkm_log2))+
  geom_histogram(bins = 1000, color="black")+
  theme_bw()+
  labs(title = "Dataset of 17195 genes \n after excluding genes expressed at a very low level", x="log2 RPKM", y = "Read counts")+
  theme(plot.title = element_text(hjust = 0.5))

# Removing low expression genes. Overwrites exp.data object used for DE analysis ### 
#dim(exp.data)  #exp.data contains counts, datExpr contains rpkms 

exp.data$gene_name <- rownames(exp.data)
exp.data <- filter(exp.data, gene_name %in% rownames(datExpr))

rownames(exp.data) <- exp.data$gene_name
exp.data$gene_name <- NULL 


# Recalculates rpkm values
# Gene lengths calculated with lapply
gene.lengths <- as.numeric(lapply(1:nrow(exp.data), function(x) FUN= as.numeric(exonic.gene.sizes[rownames(exp.data)[x]])))

# RPKM calculation
rpkm.data <- rpkm(exp.data, gene.length=gene.lengths, log=T, prior.count=.25)
rpkm.data_linear <- rpkm(exp.data, gene.length=gene.lengths, log=F)


#write.csv(rpkm.data, file = "GEO_submission #files/rpkm_log2_17051.csv")
#write.csv(rpkm.data_linear, file = "GEO_submission #files/rpkm_linear_17051.csv")

PCA plots

setwd(github_dir)

group <- ifelse(sample.info[,"Condition"]=="Saline",1,2)
rRNA <- as.numeric(sample.info$rRNA)
sex.by.rna <- factor(sample.info$sex.by.rna)
dpc <- factor(sample.info$DPC)
response <- factor(sample.info$Response)
lane <- factor(sample.info$Lane)

source("Dimensionality reduction plots.r")

grid.arrange(PCA_plot, PCA_plot_sex, ncol = 2)

Saline vs PolyIC developmental time prediction linear model

# 19.5 was used as a numerical representation of the P0 time point.

test.dpc <- as.factor(sample.info$DPC)
test.sex.by.rna <- as.factor(sample.info$sex_by_rna)
test.response <- as.factor(sample.info$Response)
test.rRNA <- sample.info$rRNA
test.group <- as.factor(sample.info$Condition)
test.lane <- as.factor(sample.info$Lane)
test.data <- exp.data

min.cpm.criteria <- 0.1
min.cpm <- min.cpm.criteria

#qCML expression modeling
y <- DGEList(counts=test.data, group=test.group)
keep <- rowSums(cpm(y)>min.cpm) >=2
y <- y[keep, , keep.lib.sizes=FALSE]
## Perform simple exact test on group
y <- calcNormFactors(y)
y <- estimateCommonDisp(y)
y <- estimateTagwiseDisp(y)


test.pseudo <- y$pseudo.counts
pca.results <- prcomp(test.pseudo)

#PCA dataframe does not contain all samples as the exp.data, so I have to find which datapoints are control and polyic now
control.datapoints <- which(sample.info$Condition == "Saline")
polyic.datapoints <- which(sample.info$Condition == "PolyIC")

train.data <- data.frame(PCA.1=pca.results$rotation[control.datapoints,1],
                         PCA.2=pca.results$rotation[control.datapoints,2],
                         PCA.3=pca.results$rotation[control.datapoints,3],
                         PCA.4=pca.results$rotation[control.datapoints,4],
                         PCA.5=pca.results$rotation[control.datapoints,5],
                         rRNA=as.numeric(test.rRNA[control.datapoints]),
                         sex=sex.by.rna[control.datapoints],
                         lane=test.lane[control.datapoints],
                         response=test.response[control.datapoints]
)

predict.data <- data.frame(PCA.1=pca.results$rotation[polyic.datapoints,1],
                         PCA.2=pca.results$rotation[polyic.datapoints,2],
                         PCA.3=pca.results$rotation[polyic.datapoints,3],
                         PCA.4=pca.results$rotation[polyic.datapoints,4],
                         PCA.5=pca.results$rotation[polyic.datapoints,5],
                         rRNA=as.numeric(test.rRNA[polyic.datapoints]),
                         sex=sex.by.rna[polyic.datapoints],
                         lane=test.lane[polyic.datapoints],
                         response=test.response[polyic.datapoints]
)

lm.model <- lm(as.numeric(as.character(test.dpc[control.datapoints])) ~
                 PCA.1 + PCA.2 + PCA.3 + PCA.4 + PCA.5 + rRNA + sex + response
               , data = train.data
)


# Predicts DPC from the linear model built on train data.
predict.saline <- predict(lm.model, newdata = train.data)
predict.polyic <- predict(lm.model, newdata = predict.data)

# I had some trouble with the original for loop and I replaced it with lapply. I also added the sd values and a plot representing modeled DPC values.
dpc.predict.mean.saline <- lapply(1:4, function(x) FUN=mean(predict.saline[which(as.character(test.dpc[control.datapoints])==unique(test.dpc)[x])]))

dpc.predict.sd.saline <- lapply(1:4, function(x) FUN=sd(predict.saline[which(as.character(test.dpc[control.datapoints])==unique(test.dpc)[x])]))

dpc.predict.mean.polyic <- lapply(1:4, function(x) FUN=mean(predict.polyic[which(as.character(test.dpc[polyic.datapoints])==unique(test.dpc)[x])]))

dpc.predict.sd.polyic <- lapply(1:4, function(x) FUN=sd(predict.polyic[which(as.character(test.dpc[polyic.datapoints])==unique(test.dpc)[x])]))

dpc.predict.mean.saline <- as.numeric(dpc.predict.mean.saline)
dpc.predict.sd.saline <- as.numeric(dpc.predict.sd.saline)
dpc.predict.mean.polyic <- as.numeric(dpc.predict.mean.polyic)
dpc.predict.sd.polyic <- as.numeric(dpc.predict.sd.polyic)


lm.predicted.dpc <- data.frame(dpc.predict.mean.saline, dpc.predict.sd.saline, dpc.predict.mean.polyic, dpc.predict.sd.polyic)


####
df_1 <- data.frame("Actual_DPC" = as.numeric(as.character(test.dpc[control.datapoints])),
                   "Predicted_DPC" = predict.saline, "Condition" = rep("Saline", length(predict.saline)))


df_2 <- data.frame("Actual_DPC" = as.numeric(as.character(test.dpc[polyic.datapoints])),
                   "Predicted_DPC" = predict.polyic, "Condition" = rep("Poly(I:C)", length(predict.polyic)))

df_combined <- rbind(df_1, df_2)


mean_data <- group_by(df_combined, Actual_DPC, Condition) %>%
             summarise(average_predicted_dpc = mean(Predicted_DPC, na.rm = TRUE),
                       sd_predicted_dpc = sd(Predicted_DPC, na.rm = TRUE)
                       )

mean_data <- as.data.frame(mean_data)

j_brew_colors <- brewer.pal(n = 8, name = "Paired")[c(6,2)]


df_combined$Condition <- ifelse(df_combined$Condition == "Saline", "Control", as.character(df_combined$Condition))
mean_data$Condition <- ifelse(mean_data$Condition == "Saline", "Control", as.character(mean_data$Condition))


Fig_4c <- ggplot()+
  geom_jitter(data=df_combined, aes(x = Actual_DPC, y = Predicted_DPC,  color=Condition), size = 2, height= 0.3, alpha=0.7)+
  geom_line(data=mean_data, aes(x = Actual_DPC, y = average_predicted_dpc, color=Condition), size=1.5, alpha=0.7)+
  theme_bw()+
  scale_color_manual(values=c("Control"=j_brew_colors[2], "Poly(I:C)" = j_brew_colors[1]))+
  scale_x_continuous(breaks= c(12.5, 14.5, 17.5, 19.5), labels=c("E12.5", "E14.5", "E17.5", "P0"))+
  scale_y_continuous(breaks= c(12.5, 14.5, 17.5, 19.5), labels=c("E12.5", "E14.5", "E17.5", "P0"))+
  labs(y = "Predicted DPC", x = "Actual DPC")+
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())

Fig_4c

Statistical analysis of the developmental timing prediction

#Nicer dataframe
predicted.dpc.df <- data.frame("Real_DPC"=c(12.5, 14.5, 17.5, 19.5), "Predicted_Saline_Mean_DPC"= dpc.predict.mean.saline, "Predicted_Saline_SD_DPC"=dpc.predict.sd.saline, "Predicted_PolyIC_Mean_DPC"= dpc.predict.mean.polyic, "Predicted_PolyIC_SD_DPC"=dpc.predict.sd.polyic)

##### T test ####
t.test.12 <- t.test(predict.saline[which(group[control.datapoints]==1 & as.character(dpc[control.datapoints])=="12.5")], predict.polyic[which(group[polyic.datapoints]==2 & as.character(dpc[polyic.datapoints])=="12.5")])

t.test.14 <- t.test(predict.saline[which(group[control.datapoints]==1 & as.character(dpc[control.datapoints])=="14.5")], predict.polyic[which(group[polyic.datapoints]==2 & as.character(dpc[polyic.datapoints])=="14.5")])

t.test.17 <- t.test(predict.saline[which(group[control.datapoints]==1 & as.character(dpc[control.datapoints])=="17.5")], predict.polyic[which(group[polyic.datapoints]==2 & as.character(dpc[polyic.datapoints])=="17.5")])

t.test.19 <- t.test(predict.saline[which(group[control.datapoints]==1 & as.character(dpc[control.datapoints])=="19.5")], predict.polyic[which(group[polyic.datapoints]==2 & as.character(dpc[polyic.datapoints])=="19.5")])

t.test.modeled.DPC.p.val <- data.frame("p-value E12.5"=t.test.12$p.value, "p-value E14.5"=t.test.14$p.value, "p-value E17.5"=t.test.17$p.value, "p-value E19.5"=t.test.19$p.value)

predicted_DPC_df <- reshape2::melt(data.frame("mean E12.5"=t.test.12$estimate, "mean E14.5"=t.test.14$estimate, "mean E17.5"=t.test.17$estimate, "mean E19.5"=t.test.19$estimate))


t_test_p_value=c(t.test.12$p.value, t.test.14$p.value, t.test.17$p.value, t.test.19$p.value)
specify_decimal <- function(x, k) trimws(format(round(x, k), nsmall=k))

predicted_DPC_df_nice <- data.frame("Real_DPC"= c(12.5, 14.5, 17.5, 19.5), "Predicted_DPC_Saline"=predicted_DPC_df$value[c(1,3,5,7)], "Predicted_DPC_PolyIC"=predicted_DPC_df$value[c(2,4,6,8)], "t_test_p_value"=specify_decimal(t_test_p_value, 8))

knitr::kable(predicted_DPC_df_nice)
Real_DPC Predicted_DPC_Saline Predicted_DPC_PolyIC t_test_p_value
12.5 12.95995 12.95008 0.98141714
14.5 14.63756 15.34472 0.12438056
17.5 16.68691 19.01691 0.00000121
19.5 19.48706 19.27017 0.40513411

Differential gene expression

single_timepoint_glm_function<- function(x){

control.datapoints <- intersect(control.datapoints, which(dpc == x))
polyic.datapoints <- intersect(polyic.datapoints, which(dpc == x))

use.cols <- c(control.datapoints, polyic.datapoints)

test.dpc <- dpc[use.cols]
test.sex.by.rna <- sex.by.rna[use.cols]
test.response <- response[use.cols]
test.rRNA <- as.numeric(rRNA)[use.cols]
test.group <- group[use.cols]
test.lane <- as.numeric(lane)[use.cols]
test.data <- exp.data[,use.cols]

min.cpm.criteria <- 0.1
min.cpm <- min.cpm.criteria

design <- model.matrix(~as.factor(test.lane) + as.factor(test.sex.by.rna) + as.numeric(test.group))
y <- DGEList(counts=test.data, group=group[use.cols])
keep <- rowSums(cpm(y)>min.cpm) >=2 #keeps only genes expressed in above min.cpm in at least 2 libraries in each group 
y <- y[keep, , keep.lib.sizes=FALSE]
y <- estimateGLMCommonDisp(y,design)
y <- estimateGLMTrendedDisp(y,design)
y <- estimateGLMTagwiseDisp(y,design)
fit <- glmFit(y,design) 
lrt <- glmLRT(fit) # Genewise Negative Binomial Generalized Linear Models.

glm.output <- topTags(lrt, n=Inf)
glm.output.full <- glm.output$table

glm.output.full$gene_name <- rownames(glm.output.full)
rownames(glm.output.full) <- NULL
glm.output.full <- glm.output.full[,c(6,1:5)]

glm.output.full

}

E12.5_GLM <- single_timepoint_glm_function(12.5)
E14.5_GLM <- single_timepoint_glm_function(14.5)
E17.5_GLM <- single_timepoint_glm_function(17.5)
E19.5_GLM <- single_timepoint_glm_function(19.5)

Numbers of DE genes

E12.5_up_n <- nrow(filter(E12.5_GLM, PValue < 0.05, logFC > 0))
E12.5_down_n <- nrow(filter(E12.5_GLM, PValue < 0.05, logFC < 0))

E14.5_up_n <- nrow(filter(E14.5_GLM, PValue < 0.05, logFC > 0))
E14.5_down_n <- nrow(filter(E14.5_GLM, PValue < 0.05, logFC < 0))

E17.5_up_n <- nrow(filter(E17.5_GLM, PValue < 0.05, logFC > 0))
E17.5_down_n <- nrow(filter(E17.5_GLM, PValue < 0.05, logFC < 0))

E19.5_up_n <- nrow(filter(E19.5_GLM, PValue < 0.05, logFC > 0))
E19.5_down_n <- nrow(filter(E19.5_GLM, PValue < 0.05, logFC < 0))

###

E12.5_up_n_FDR <- nrow(filter(E12.5_GLM, FDR < 0.05, logFC > 0))
E12.5_down_n_FDR <- nrow(filter(E12.5_GLM, FDR < 0.05, logFC < 0))

E14.5_up_n_FDR <- nrow(filter(E14.5_GLM, FDR < 0.05, logFC > 0))
E14.5_down_n_FDR <- nrow(filter(E14.5_GLM, FDR < 0.05, logFC < 0))

E17.5_up_n_FDR <- nrow(filter(E17.5_GLM, FDR < 0.05, logFC > 0))
E17.5_down_n_FDR <- nrow(filter(E17.5_GLM, FDR < 0.05, logFC < 0))

E19.5_up_n_FDR <- nrow(filter(E19.5_GLM, FDR < 0.05, logFC > 0))
E19.5_down_n_FDR <- nrow(filter(E19.5_GLM, FDR < 0.05, logFC < 0))


DE_df_n <- t(data.frame("E12.5" =c(E12.5_up_n, E12.5_down_n, E12.5_up_n_FDR, E12.5_down_n_FDR),
           "E14.5" =c(E14.5_up_n, E14.5_down_n, E14.5_up_n_FDR, E14.5_down_n_FDR),
           "E17.5" =c(E17.5_up_n, E17.5_down_n, E17.5_up_n_FDR, E17.5_down_n_FDR),
           "P0" =c(E19.5_up_n, E19.5_down_n, E19.5_up_n_FDR, E19.5_down_n_FDR)))

colnames(DE_df_n) <- c("Upregulated", "Downregulated", "Upregulated", "Downregulated")


DE_df_n_melted <- melt(DE_df_n)

DE_df_n_melted$stat <- c(rep(", P < 0.05", 8), rep(", FDR < 0.05", 8))

DE_df_n_melted$Var2_stat <- paste(DE_df_n_melted$Var2, DE_df_n_melted$stat)


Fig1f <- ggplot(DE_df_n_melted, aes(fill=Var2_stat, group=Var2, x=Var1, y=value))+
  geom_bar(position = "dodge",  stat="identity", color="black")+
   theme_bw()+
  #scale_fill_manual(values=brewer.pal(n = 8, name = "Paired")[c(6,6,2,2 )])+
  scale_fill_manual(values=c("#1F78B4", "#62a0ca", "#E31A1C", "#eb5e60"))+
  theme(legend.title=element_blank())+
  labs(title="Number of DE genes with P < 0.05 and FDR < 0.05", x="", y="")+
  theme(plot.title = element_text(size = rel(1.2), hjust=0.5))+
  geom_text(aes(label=value), position=position_dodge(width=0.9), hjust=-0.2)+
  coord_flip()+
  scale_x_discrete(limits = rev(levels(DE_df_n_melted$Var1)))+
  theme(panel.border = element_blank(),
        #legend.key = element_blank(),
        axis.ticks = element_blank(),
        axis.text  = element_blank(),
        axis.text.y = element_blank(),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())+
  theme(legend.position="bottom")

Fig1f

Volcano plots

volcano_plot_text_2 <- function(x, title) {

#This is nicely dealing with datasets with missing cathegories.
test <- ifelse(x$PValue, "Non significant")
test <- ifelse(x$logFC > 0 & x$PValue <0.05, "PValue < 0.05 & logFC > 0", test)
test <- ifelse(x$logFC < 0 & x$PValue <0.05, "PValue < 0.05 & logFC < 0", test)
test <- ifelse(x$logFC > 0 & x$FDR <0.05, "FDR < 0.05 & logFC > 0", test)
test <- ifelse(x$logFC < 0 & x$FDR <0.05, "FDR < 0.05 & logFC < 0", test)

plotDat <- data.frame(x, Group=test)

plotDat$logFC <- ifelse(plotDat$logFC > 4, 4, plotDat$logFC)
plotDat$logFC <- ifelse(plotDat$logFC < -4, -4, plotDat$logFC)


p <- ggplot(plotDat, aes(x = logFC, y=-log10(PValue), fill=Group, col = Group)) +
  geom_point(aes(text=gene_name), size=0.5, pch=21, alpha=0.7, stroke = 0.5)+
  theme_light()+
 scale_fill_manual(values=c("Non significant"="grey30", 
                             "PValue < 0.05 & logFC > 0"="#eb5e60", 
                             "PValue < 0.05 & logFC < 0"="#62a0ca", 
                             "FDR < 0.05 & logFC > 0" = "#960304", 
                             "FDR < 0.05 & logFC < 0" = "#01538a"))+
   scale_color_manual(values=c("Non significant"="grey30", 
                             "PValue < 0.05 & logFC > 0"="#eb5e60", 
                             "PValue < 0.05 & logFC < 0"="#62a0ca", 
                             "FDR < 0.05 & logFC > 0" = "#960304", 
                             "FDR < 0.05 & logFC < 0" = "#01538a"))+
  labs(title= title, y="", x="")+
  theme(plot.title = element_text(size = rel(2), hjust=0.5))+
  theme(legend.text=element_text(size=16))+
  theme(axis.text=element_text(size=14))+
  theme(legend.title=element_blank())+
  theme(axis.text=element_text(size=14))+
  theme(axis.title = element_text(size=14, face = "bold"))+
  theme(legend.position="bottom")+
  coord_cartesian(xlim = c(-4.2, 4.2))+
  geom_hline(yintercept = -log10(0.05), linetype=2)+
   scale_x_continuous(breaks=c(-4, -2, 0, 2, 4), labels=c("<-4","-2", "0", "2", ">4"))+
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())

ggplotly(p) %>%
  layout(legend = list(
      orientation = "v",
      y = -0.1, 
      font=list(
            size=14
        )
    )
  )
}


#################  Version capped at different y axis levels ###############

volcano_plot_text_3 <- function(x, title) {

#This is nicely dealing with datasets with missing cathegories.
test <- ifelse(x$PValue, "Non significant")
test <- ifelse(x$logFC > 0 & x$PValue <0.05, "PValue < 0.05 & logFC > 0", test)
test <- ifelse(x$logFC < 0 & x$PValue <0.05, "PValue < 0.05 & logFC < 0", test)
test <- ifelse(x$logFC > 0 & x$FDR <0.05, "FDR < 0.05 & logFC > 0", test)
test <- ifelse(x$logFC < 0 & x$FDR <0.05, "FDR < 0.05 & logFC < 0", test)


plotDat <- data.frame(x, Group=test)

plotDat$logFC <- ifelse(plotDat$logFC > 4, 4, plotDat$logFC)
plotDat$logFC <- ifelse(plotDat$logFC < -4, -4, plotDat$logFC)

plotDat$log10_PValue <- -log10(plotDat$PValue)

plotDat$log10_PValue <- ifelse(plotDat$log10_PValue > 8, 8, plotDat$log10_PValue)

#scale_fill_manual(values=c("#1F78B4" - blue, "#62a0ca" - lighter blue, "#E31A1C" - red, "#eb5e60" - lighter red))+

p <- ggplot(plotDat, aes(x = logFC, y=log10_PValue, fill=Group, col = Group)) +
  geom_point(aes(text=gene_name), size=0.5, pch=21, alpha=0.7, stroke = 0.5)+
  theme_light()+
  scale_fill_manual(values=c("Non significant"="grey30", 
                             "PValue < 0.05 & logFC > 0"="#eb5e60", 
                             "PValue < 0.05 & logFC < 0"="#62a0ca", 
                             "FDR < 0.05 & logFC > 0" = "#960304", 
                             "FDR < 0.05 & logFC < 0" = "#01538a"))+
   scale_color_manual(values=c("Non significant"="grey30", 
                             "PValue < 0.05 & logFC > 0"="#eb5e60", 
                             "PValue < 0.05 & logFC < 0"="#62a0ca", 
                             "FDR < 0.05 & logFC > 0" = "#960304", 
                             "FDR < 0.05 & logFC < 0" = "#01538a"))+
  labs(title= title, y="", x="")+
  theme(plot.title = element_text(size = rel(2), hjust=0.5))+
  theme(legend.text=element_text(size=16))+
  theme(axis.text=element_text(size=14))+
  theme(legend.title=element_blank())+
  theme(axis.text=element_text(size=14))+
  theme(axis.title = element_text(size=14, face = "bold"))+
  theme(legend.position="bottom")+
  coord_cartesian(xlim = c(-4.2, 4.2))+
  expand_limits(x = c(-4.2, 4.2))+
  coord_cartesian(ylim = c(0, 8))+
  geom_hline(yintercept = -log10(0.05), linetype=2)+
  scale_x_continuous(breaks=c(-4, -2, 0, 2, 4), labels=c("<-4","-2", "0", "2", ">4"))+
  scale_y_continuous(breaks=c(0, 2, 4, 6, 8), labels=c("0","2", "4", "6", ">8"))+
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())

ggplotly(p) %>%
  layout(legend = list(
      orientation = "v",
      y = -0.1, 
      font=list(
            size=14
        )
    )
  )
}

E12.5

p <- volcano_plot_text_3(E12.5_GLM, "E12.5")
p
datatable(E12.5_GLM, rownames = FALSE, filter="top", options = list(pageLength = 15, scrollX=T))

E14.5

p <- volcano_plot_text_3(E14.5_GLM, "E14.5")
p
datatable(E14.5_GLM, rownames = FALSE, filter="top", options = list(pageLength = 15, scrollX=T))

E17.5

p <- volcano_plot_text_2(E17.5_GLM, "E17.5")
p
datatable(E17.5_GLM, rownames = FALSE, filter="top", options = list(pageLength = 15, scrollX=T))

P0

p <- volcano_plot_text_3(E19.5_GLM, "P0")
p
datatable(E19.5_GLM, rownames = FALSE, filter="top", options = list(pageLength = 15, scrollX=T))

Heatmap of DE genes

Figure 1d

#Build dataframe of logFC values
padding <- 0
a <- data.frame("gene_name"=E12.5_GLM$gene_name, "E12.5_logFC"=E12.5_GLM$logFC+padding)
b <- data.frame("gene_name"=E14.5_GLM$gene_name, "E14.5_logFC"=E14.5_GLM$logFC+padding)
c <- data.frame("gene_name"=E17.5_GLM$gene_name, "E17.5_logFC"=E17.5_GLM$logFC+padding)
d <- data.frame("gene_name"=E19.5_GLM$gene_name, "E19.5_logFC"=E19.5_GLM$logFC+padding)

e <- merge(a, b, by="gene_name")
f <- merge(c,d, by="gene_name")
df <- merge(e,f, by="gene_name")




FDR_threshold <- 0.05
logFC_threshold <- 1

a <- dplyr::filter(E12.5_GLM, FDR < FDR_threshold & 
              logFC > logFC_threshold | logFC < -1*logFC_threshold)$gene_name
b <- dplyr::filter(E14.5_GLM, FDR < FDR_threshold & 
              logFC > logFC_threshold | logFC < -1*logFC_threshold)$gene_name
c <- dplyr::filter(E17.5_GLM, FDR < FDR_threshold & 
              logFC > logFC_threshold | logFC < -1*logFC_threshold)$gene_name
d <- dplyr::filter(E19.5_GLM, FDR < FDR_threshold & 
              logFC > logFC_threshold | logFC < -1*logFC_threshold)$gene_name

DE_genes_logFC_1_20_top <- unique(c(a, b, c, d))

#length(DE_genes_logFC_1_20_top)

#Filter DE data frame
df_1 <- dplyr::filter(df, gene_name %in% DE_genes_logFC_1_20_top)

rownames(df_1) <- df_1$gene_name
df_1$gene_name <- NULL

colnames(df_1) <- c("E12.5", "E14.5", "E17.5", "P0")

rownames(df_1) <- NULL
df_1 <- df_1[,c(1:4)]

paletteLength <- 100
test <- as.matrix(df_1)

myBreaks <- c(seq(min(test), 0, length.out=ceiling(paletteLength/2) + 1), 
              seq(max(test)/paletteLength, max(test), length.out=floor(paletteLength/2)))

myColor <- colorRampPalette(c("#9E0041", "#fea11d","#fea11d", "white", "#43abea", "#0a48aa", "#13008e"))(paletteLength)


Fig_1d <- pheatmap(as.matrix(df_1), 
         clustering_distance_rows="correlation", 
         cluster_rows = T, 
         cluster_cols = F, 
         legend = T,
         angle_col = 0,
         legend_breaks = c(-4, -2, 0, 2, 4, max(df_1)),
         legend_labels = c("-4", "-2", "0", "2", "4","log2FC"),
         fontsize_row  = gene_name_font_size, 
         color = rev(myColor), 
         breaks = myBreaks,
         main = bquote(atop("Heatmap of DE genes",
                       "FDR < 0.05 and log" [2] *"FC >1 or <-1")))
        

Fig_1d

SFARI gene enrichment among DE genes

SFARI_genes <- read.csv("./GEO_submission files/SFARI-Gene_genes_01-15-2019release_02-26-2019export.csv")

#head(SFARI_genes)

#We are using only high confidence genes associated with autism
SFARI_genes_filtered <- filter(SFARI_genes, gene.score <3)


require("biomaRt")

#Function translating human to mouse orthologs
humanToMouse <- function(x){
  
  human = useMart("ensembl", dataset = "hsapiens_gene_ensembl")
  mouse = useMart("ensembl", dataset = "mmusculus_gene_ensembl")
  
  # comparison between mouse and human, returns mouse gene equivalents
  genes = getLDS(attributes = c("hgnc_symbol"), filters = "hgnc_symbol", values = x , mart = human, attributesL = c("mgi_symbol"), martL = mouse, uniqueRows=T)
  
  mousex <- unique(genes[, 2])
  return(mousex)
}

#Returning a df matching orthologs
humanToMouse_2 <- function(x){
  
  human = useMart("ensembl", dataset = "hsapiens_gene_ensembl")
  mouse = useMart("ensembl", dataset = "mmusculus_gene_ensembl")
  
  # comparison between mouse and human, returns mouse gene equivalents
  genes = getLDS(attributes = c("hgnc_symbol"), filters = "hgnc_symbol", values = x , mart = human, attributesL = c("mgi_symbol"), martL = mouse, uniqueRows=T)
  
  mousex <- genes
  return(mousex)
}

#Translating orthologs
SFARI_genes_mouse <- humanToMouse_2(SFARI_genes$gene.symbol)
#SFARI_genes_mouse

SFARI_genes_with_ortho <- merge(SFARI_genes, SFARI_genes_mouse, by.x= "gene.symbol", by.y = "HGNC.symbol")

#making sure still have all the 87 SFARI genes with gene.score 1 and 2
x <- filter(SFARI_genes_with_ortho, gene.score %in% c(1,2))[,c(1,9)]


SFARI_df_to_save <- filter(SFARI_genes_with_ortho, gene.score %in% c(1,2))[ ,c(1, 3:9)]
colnames(SFARI_df_to_save)[8] <- "MGI.symbol_ortholog"

#write.csv(SFARI_df_to_save, file="Supplementary Table 1, #High_confidence_SFARI_mouse_orthologs.csv",row.names = F)

#length(unique(x$gene.symbol)) # 87 Human SFARI genes
#length(unique(x$MGI.symbol))  # 89 mouse orthologs;  MSNP1AS doesn't have an ortholog; DDX3X has 2 orthologs, SRCAP has 2 orthologs too. 

#setdiff(SFARI_genes_filtered$gene.symbol, SFARI_genes_with_ortho$gene.symbol)

#### Circular enrichment plot and permutation test enrichment analysis#### 
`%notin%` <- Negate(`%in%`)

SFARI_DE_df_annotation <- function(x){

cat_1 <- filter(x, gene_name %in% filter(SFARI_genes_with_ortho, gene.score == 1)$MGI.symbol)
cat_2 <- filter(x, gene_name %in% filter(SFARI_genes_with_ortho, gene.score == 2)$MGI.symbol)
cat_3 <- filter(x, gene_name %in% filter(SFARI_genes_with_ortho, gene.score == 3)$MGI.symbol)
cat_4 <- filter(x, gene_name %in% filter(SFARI_genes_with_ortho, gene.score == 4)$MGI.symbol)
cat_5 <- filter(x, gene_name %in% filter(SFARI_genes_with_ortho, gene.score == 5)$MGI.symbol)
cat_6 <- filter(x, gene_name %in% filter(SFARI_genes_with_ortho, gene.score == 6)$MGI.symbol)
cat_NA <- filter(x, gene_name %in% filter(SFARI_genes_with_ortho, gene.score == "NA")$MGI.symbol)
cat_Non_Ausitm <- filter(x, gene_name %notin% filter(SFARI_genes_with_ortho, gene.score %in% c(1, 2,3,4,5,6, "NA"))$MGI.symbol)

cat_1$SFARI_category <- rep(1, nrow(cat_1))
cat_2$SFARI_category <- rep(2, nrow(cat_2))
cat_3$SFARI_category <- rep(3, nrow(cat_3))
cat_4$SFARI_category <- rep(4, nrow(cat_4))
cat_5$SFARI_category <- rep(5, nrow(cat_5))
cat_6$SFARI_category <- rep(6, nrow(cat_6))
cat_NA$SFARI_category <- rep(NA, nrow(cat_NA))
cat_Non_Ausitm$SFARI_category <- rep("Non_Ausitm", nrow(cat_Non_Ausitm))

test <- rbind(cat_1, cat_2, cat_3, cat_4, cat_5, cat_6, cat_NA, cat_Non_Ausitm)

test$Up_or_Down <- ifelse(test$logFC > 0, "Upregulated", "Downregulated")

test

}

E12.5_GLM_SFARI_cat <- SFARI_DE_df_annotation(E12.5_GLM)
E14.5_GLM_SFARI_cat <- SFARI_DE_df_annotation(E14.5_GLM)
E17.5_GLM_SFARI_cat <- SFARI_DE_df_annotation(E17.5_GLM)
E19.5_GLM_SFARI_cat <- SFARI_DE_df_annotation(E19.5_GLM)


E12.5_GLM_SFARI_cat$DPC <- rep("E12.5", nrow(E12.5_GLM_SFARI_cat))
E14.5_GLM_SFARI_cat$DPC <- rep("E14.5", nrow(E14.5_GLM_SFARI_cat))
E17.5_GLM_SFARI_cat$DPC <- rep("E17.5", nrow(E17.5_GLM_SFARI_cat))
E19.5_GLM_SFARI_cat$DPC <- rep("E19.5", nrow(E19.5_GLM_SFARI_cat))

#Checking the number of SFARI genes with gene.score 1 or 2 at each timepoint
#filter(E12.5_GLM_SFARI_cat, SFARI_category %in% c(1,2)) #82 genes
#filter(E14.5_GLM_SFARI_cat, SFARI_category %in% c(1,2)) #82 genes
#filter(E17.5_GLM_SFARI_cat, SFARI_category %in% c(1,2)) #82 genes
#filter(E19.5_GLM_SFARI_cat, SFARI_category %in% c(1,2)) #82 genes


SFARI_genes_filtered_mouse <- x$MGI.symbol

#Double checking how many of the SFARI orthologs overlap with genes in DE analysis
#length(unique(SFARI_genes_filtered_mouse)) #There are 89 mouse orthologs of gene.score 1 and 2 SFARI genes

#filter(E12.5_GLM, gene_name %in% SFARI_genes_filtered_mouse) #82 genes
#filter(E14.5_GLM, gene_name %in% SFARI_genes_filtered_mouse) #82 genes
#filter(E17.5_GLM, gene_name %in% SFARI_genes_filtered_mouse) #82 genes
#filter(E19.5_GLM, gene_name %in% SFARI_genes_filtered_mouse) #82 genes
#The missing 7 genes are just not sufficiently expressed. 


##### Stacked circular bar plot - the one in figure 2 #####
df <- rbind(E12.5_GLM_SFARI_cat, E14.5_GLM_SFARI_cat, E17.5_GLM_SFARI_cat, E19.5_GLM_SFARI_cat)

# Filtering SFARI genes with gene.score 1 and 2
df_test <- filter(df, SFARI_category %in% c(1,2))


#I'm replacing the real values of logFC to 1 and -1 to build a stacked bar plot
df_test$logFC <- ifelse(df_test$logFC > 0, 1, -1)

#Annotates if a gene is significant
df_test$Sig <- as.character(ifelse(df_test$PValue < 0.05, "DE", "Non-DE"))

#A combination fo DPC and significance
df_test$DPC_Sig <- paste(df_test$DPC, df_test$Sig, sep="_")

my_col <- brewer.pal(n = 8, name = "Set1")

#head(df_test)


#Add Up or down
 df_test$dir <- ifelse(df_test$logFC > 0, "Up", "Down")  # Redundant
 df_test$DPC_Sig_dir <- paste(df_test$DPC, df_test$Sig, df_test$dir, sep = "_")

 df_test <- arrange(df_test, gene_name)

#Yes, I checked the gene names are the same in the columns
#all(filter(df_test, DPC == "E12.5")$gene_name == filter(df_test, DPC == "E14.5")$gene_name)
#all(filter(df_test, DPC == "E17.5")$gene_name == filter(df_test, DPC == "E19.5")$gene_name)
#all(filter(df_test, DPC == "E12.5")$gene_name == filter(df_test, DPC == "E17.5")$gene_name)

## A better matrix for clustering with 0 of non-DE logFC values
df_test$logFC_sig <- ifelse(df_test$PValue > 0.05, 0, df_test$logFC)
  
df_matrix <- data.frame("gene_name" = filter(df_test, DPC == "E12.5")$gene_name,
           "E12.5" = filter(df_test, DPC == "E12.5")$logFC_sig,
           "E14.5" = filter(df_test, DPC == "E14.5")$logFC_sig,
           "E17.5" = filter(df_test, DPC == "E17.5")$logFC_sig,
           "E19.5" = filter(df_test, DPC == "E19.5")$logFC_sig)  

#head(df_matrix)

rownames(df_matrix) <- df_matrix$gene_name
df_matrix$gene_name <- NULL

# Dissimilarity matrix
d <- dist(df_matrix, method = "euclidean")

# Hierarchical clustering using Complete Linkage
hc1 <- hclust(d, method = "ward.D2" )

# Plot dendrogram
#plot(hc1, cex = 0.6, hang = -1)
 
#head(df_test)
 
df_test$gene_name <- factor(df_test$gene_name, levels = unique(df_test$gene_name)[hc1$order])   
df_test <- arrange(df_test, gene_name) 

### Angle of labels
step <- 360/length(unique(df_test$gene_name))
myAng <- seq(0, 360, step)
myAng <- rev(myAng)
myAng <- myAng - 90
myAng <- ifelse(myAng > 90 & myAng < 270, myAng-180, myAng)

#Adding FDR
df_test$DPC_Sig_dir <- ifelse(df_test$FDR < 0.05 & df_test$logFC == -1 & df_test$DPC == "E12.5", "E12.5_Down_FDR", df_test$DPC_Sig_dir)
df_test$DPC_Sig_dir <- ifelse(df_test$FDR < 0.05 & df_test$logFC == -1 & df_test$DPC == "E14.5", "E14.5_Down_FDR", df_test$DPC_Sig_dir)
df_test$DPC_Sig_dir <- ifelse(df_test$FDR < 0.05 & df_test$logFC == -1 & df_test$DPC == "E17.5", "E17.5_Down_FDR", df_test$DPC_Sig_dir)
df_test$DPC_Sig_dir <- ifelse(df_test$FDR < 0.05 & df_test$logFC == -1 & df_test$DPC == "E19.5", "E19.5_Down_FDR", df_test$DPC_Sig_dir)

df_test$DPC_Sig_dir <- ifelse(df_test$FDR < 0.05 & df_test$logFC == 1 & df_test$DPC == "E12.5", "E12.5_Up_FDR", df_test$DPC_Sig_dir)
df_test$DPC_Sig_dir <- ifelse(df_test$FDR < 0.05 & df_test$logFC == 1 & df_test$DPC == "E14.5", "E14.5_Up_FDR", df_test$DPC_Sig_dir)
df_test$DPC_Sig_dir <- ifelse(df_test$FDR < 0.05 & df_test$logFC == 1 & df_test$DPC == "E17.5", "E17.5_Up_FDR", df_test$DPC_Sig_dir)
df_test$DPC_Sig_dir <- ifelse(df_test$FDR < 0.05 & df_test$logFC == 1 & df_test$DPC == "E19.5", "E19.5_Up_FDR", df_test$DPC_Sig_dir)



df_test$Sig_dir <- ifelse(df_test$PValue < 0.05 & df_test$logFC == -1, "P_Down", "Non-Sig")
df_test$Sig_dir <- ifelse(df_test$PValue < 0.05 & df_test$logFC == 1, "P_Up", df_test$Sig_dir)
df_test$Sig_dir <- ifelse(df_test$FDR < 0.05 & df_test$logFC == -1, "P_Down", df_test$Sig_dir)
df_test$Sig_dir <- ifelse(df_test$FDR < 0.05 & df_test$logFC == 1, "P_Up", df_test$Sig_dir)


##### 

Fig1g <- ggplot(df_test, aes(x=gene_name, y = abs(logFC), fill=DPC_Sig_dir))+
  geom_bar(stat = "identity", alpha = 1, position = "stack")+
  annotate("rect", xmin=0, xmax=Inf, ymin=-1, ymax=0, alpha=1, fill="white")+
  annotate("rect", xmin=0, xmax=Inf, ymin=0, ymax=1, alpha=0.5, fill="grey10")+
  annotate("rect", xmin=0, xmax=Inf, ymin=1, ymax=2, alpha=0.5, fill="grey30")+
  annotate("rect", xmin=0, xmax=Inf, ymin=2, ymax=3, alpha=0.5, fill="grey60")+
  annotate("rect", xmin=0, xmax=Inf, ymin=3, ymax=4, alpha=0.5, fill="grey90")+
  geom_bar(stat = "identity", alpha = 1, position = "stack")+
  coord_polar()+
  theme_minimal()+
  geom_hline(yintercept = 0, size=1, linetype=1)+
  geom_hline(yintercept = 1, size=1, linetype=1)+
  geom_hline(yintercept = 2, size=1, linetype=1)+
  geom_hline(yintercept = 3, size=1, linetype=1)+
  geom_hline(yintercept = 4, size=1, linetype=1)+
  theme(axis.title.y=element_blank(),
        axis.text.y=element_blank(),
        axis.ticks.y=element_blank())+
  labs(x="", y="")+
  theme(legend.position = "none")+
  scale_fill_manual(values = c(
    "E12.5_DE_Up"=  "#eb8889",
    "E14.5_DE_Up" = "#eb8889",
    "E17.5_DE_Up" = "#eb8889",
    "E19.5_DE_Up" = "#eb8889",
    "E12.5_Non-DE_Up"=  NA,
    "E14.5_Non-DE_Up" = NA,
    "E17.5_Non-DE_Up" = NA,
    "E19.5_Non-DE_Up" = NA,
    "E12.5_DE_Down"=  "#83aecc",
    "E14.5_DE_Down" = "#83aecc",
    "E17.5_DE_Down" = "#83aecc",
    "E19.5_DE_Down" = "#83aecc",
    "E12.5_Non-DE_Down"=  NA,
    "E14.5_Non-DE_Down" = NA,
    "E17.5_Non-DE_Down" = NA,
    "E19.5_Non-DE_Down" = NA,
    ############################
    "E12.5_Down_FDR" = my_col[2],
    "E14.5_Down_FDR" = my_col[2],
    "E17.5_Down_FDR" = my_col[2],
    "E19.5_Down_FDR" = my_col[2],
    
    "E12.5_Up_FDR" = my_col[1],
    "E14.5_Up_FDR" = my_col[1],
    "E17.5_Up_FDR" = my_col[1],
    "E19.5_Up_FDR" = my_col[1]

  ))+
  ylim(-1,4)+
  theme(axis.text.x = element_text(angle = myAng, size =13, color="black"))+
  theme(panel.border = element_blank(),
        legend.key = element_blank(),
        axis.ticks = element_blank(),
        axis.text.y = element_blank(),
        panel.grid.major.y = element_blank(),
        panel.grid.minor.y = element_blank())+
  annotate(geom = "text", x = 72, y = 0.5, label = "P0", color = "black",
             angle = 50, size =10)+
  annotate(geom = "text", x = 72, y = 1.46, label = "E17.5", color = "black",
             angle = 50, size =10)+
  annotate(geom = "text", x = 72, y = 2.47, label = "E14.5", color = "black",
             angle = 50, size =10)+
 annotate(geom = "text", x = 72, y = 3.47, label = "E12.5", color = "black",
             angle = 50, size =10)

Fig1g

SFARI genes enrichment statistical tests

Permutation test

##### Permutation test per GOmpers et al. #####
# Code adopted from Alex's analysis

#All genes in the background

#Enrichment fo SFARI genes among DE genes.
permutation_test <-  function(x, plot.name){
# x <- E17.5_GLM_module_SFARI_cat
  
binary.score.reference <- ifelse(x$gene_name %in% filter(x, SFARI_category %in% c(1,2))$gene_name, 1,0)

binary.score.test <- ifelse(filter(x, PValue < 0.05)$gene_name %in% filter(x, PValue <0.05, SFARI_category %in% c(1,2))$gene_name, 1,0)


geneset.perm.test <- function(binary.score.reference, binary.score.test, iterations, plot.name) {
  
  binary.score.reference <- ifelse(is.na(binary.score.reference),0,binary.score.reference)
  binary.score.test <- ifelse(is.na(binary.score.test),0,binary.score.test)
  count.criteria <- vector(length=iterations)
  test.size <- length(binary.score.test)
  for (index in 1:iterations) {
    count.criteria[index] <- sum(sample(binary.score.reference, test.size, replace = F))
  }
  x.min <- min(c(count.criteria, sum(binary.score.test))) - 20
  if (x.min < 0) {x.min <- 0}
  x.max<- max(c(count.criteria, sum(binary.score.test))) + 20
  z <- abs(mean(count.criteria) - sum(binary.score.test))/sd(count.criteria)
  p <- 2*pnorm(-abs(z))
  e <- sum(binary.score.test)/mean(count.criteria)
  prop <- sum(binary.score.test)/length(binary.score.test)
  bg <- length(binary.score.reference)
  
  hist(count.criteria, col="gray", xlab="Count", ylab="Frequency", main=paste(plot.name, " \ncount=", sum(binary.score.test), " of ",  length(binary.score.test), "; p-value=", format(p, scientific = T, digits = 2), "; FE=", format(e, scientific = F, digits = 2), "; Prop=", format(prop, scientific = F, digits = 2), "; n(bg)=", bg, sep=""), xlim=c(x.min,x.max), cex=.75)
  
  abline(v=sum(binary.score.test), lwd=3, col="red")
  #return(list(count.criteria,z,p,e,prop,bg,sum(binary.score.test)))
}

geneset.perm.test(binary.score.reference, binary.score.test, 100000, plot.name)

}

par(mfrow=c(2,2))

perm_E12.5 <- permutation_test(E12.5_GLM_SFARI_cat, "E12.5")
perm_E14.5 <- permutation_test(E14.5_GLM_SFARI_cat, "E14.5")
perm_E17.5 <- permutation_test(E17.5_GLM_SFARI_cat, "E17.5")
perm_P0 <- permutation_test(E19.5_GLM_SFARI_cat, "P0")

Permutation and hypergeometric test P value table

#Hypergeometric test
SFARI_genes_hypergeometric_test <- function(x){ 
#Anotations on the right refer to the post in the link

#https://stats.stackexchange.com/questions/16247/calculating-the-probability-of-gene-list-overlap-between-an-rna-seq-and-a-chip-c
#x <- E17.5_GLM_module_SFARI_cat
  
  n <- length(as.character(x$gene_name))   #Total gene population 15k
  a <- length(filter(x, PValue < 0.05)$gene_name)  #RNA Seq enriched # 3000
  b <- length(filter(x, SFARI_category %in% c(1,2))$gene_name) #Enriched by Chip-Seq 400
  t <- length(filter(x, PValue < 0.05, SFARI_category %in% c(1,2))$gene_name) #Intersected 100

hypergeometric_test_p_value <- sum(dhyper(t:b, a, n - a, b))
hypergeometric_test_p_value
}

hyper_E12.5 <- SFARI_genes_hypergeometric_test(E12.5_GLM_SFARI_cat)
hyper_E14.5 <- SFARI_genes_hypergeometric_test(E14.5_GLM_SFARI_cat)
hyper_E17.5 <- SFARI_genes_hypergeometric_test(E17.5_GLM_SFARI_cat)
hyper_P0 <- SFARI_genes_hypergeometric_test(E19.5_GLM_SFARI_cat)


knitr::kable(data.frame("Age" = c("E12.5", "E14.5", "E17.5", "P0"), 
           "Permutation test P" = c("5.4e-0.1", "8.7e-01", "7.6e-06", "2.3e-01"), 
           "Hypergeometric test P" = c(hyper_E12.5, hyper_E14.5, hyper_E17.5, hyper_P0)))
Age Permutation.test.P Hypergeometric.test.P
E12.5 5.4e-0.1 0.3202565
E14.5 8.7e-01 0.6048939
E17.5 7.6e-06 0.0000073
P0 2.3e-01 0.9277218

Individual gene expression trajectories

###
# x is a gene name
# y is a plot title, in case there is an alternative gene name

rpkm_box_plot <- function(x, y){
rpkm_test <- as.data.frame(reshape2::melt(rpkm.data_linear_all[x,]))
rpkm_test_w_info <- cbind(rpkm_test, sample.info[,"ExperimentalDesign"])
rpkm_test_w_info <- cbind(rpkm_test_w_info, as.character(sample.info[,"DPC"]))
rpkm_test_w_info <- cbind(rpkm_test_w_info, as.character(sample.info[,"Condition"]))

colnames(rpkm_test_w_info) <- c("RPKM", "treatment", "DPC", "Condition")

j_brew_colors <- brewer.pal(n = 8, name = "Paired")[c(6,2)]

ggplot(rpkm_test_w_info, aes(x = DPC, y= RPKM, colour=Condition))+
  geom_smooth(formula = y ~ x, method = "loess", se=T, aes(fill=Condition,  group=Condition), size  = 0.7, alpha=0)+
  geom_boxplot(alpha=0, position="identity", size = 0.2)+
  geom_jitter(size=2, pch = 19, width = 0.2, alpha = 0.5)+
  theme_bw()+
  theme(axis.text.x=element_text(angle=50, vjust=0.9, hjust=1, size=14))+
  theme(axis.text.y=element_text(size=12))+
  theme(axis.title.y=element_text(size=14))+
  labs(title= y, x="")+
  theme(plot.title = element_text(size = rel(2), hjust=0.5))+
  scale_color_manual(values = j_brew_colors)+
  scale_x_discrete(breaks =c("12.5", "14.5", "17.5", "19.5"), labels = c("E12.5", "E14.5", "E17.5", "P0"))+
  theme(legend.position = "none",
  panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())+
  theme(legend.position = "none")
}

Fig. 3d

genes <- c("Vegfa", "Flt1", "Pdk1", "Ldha", "Bnip3", "Il20rb") 

pl <- lapply(genes, function(x) rpkm_box_plot(x,x))


marrangeGrob(pl, nrow=2, ncol=3, top = "", 
             layout_matrix = rbind(c(1,2,3), c(4,5,6)))

Fig. 4a

genes <- c("Mki67", "Eomes", "Pax6", "Sox9", "Tbr1", "Bcl11b", "Satb2", "Cux1")
gene_names <- c("Mki67 (Ki67)", "Eomes (Tbr2)", "Pax6", "Sox9", "Tbr1", "Bcl11b (Ctip2)", "Satb2", "Cux1")

df_genes <- data.frame(genes, gene_names)

pl <- lapply(1:nrow(df_genes), function(x) rpkm_box_plot(df_genes[x, 1], df_genes[x, 2]))


marrangeGrob(pl, nrow=4, ncol=2, top = "", 
             layout_matrix = rbind(c(1,2), c(3,4), c(5,6), c(7,8)))

Fig. 6q

genes <- c("Gfap", "Olig2", "Dlx2", "Gad1") 

pl <- lapply(genes, function(x) rpkm_box_plot(x,x))

marrangeGrob(pl, nrow=2, ncol=3, top = "", 
             layout_matrix = rbind(c(1,2), c(3,4)))

Fig. 7d

genes <- c("Mki67", "Eomes", "Pax6", "Sox9")
gene_names <- c("Mki67 (Ki67)", "Eomes (Tbr2)", "Pax6", "Sox9")

df_genes <- data.frame(genes, gene_names)

pl <- lapply(1:nrow(df_genes), function(x) rpkm_box_plot(df_genes[x, 1], df_genes[x, 2]))


marrangeGrob(pl, nrow=2, ncol=2, top = "", 
             layout_matrix = rbind(c(1,2), c(3,4)))